### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### Creates iBuyer pricing error tables.

library(data.table)
library(ggplot2)
library(lfe)
library(Hmisc)
library(zoo)
library(stargazer)
library(scales)

source('0_helper_functions.r')
setDTthreads(20)


Table_3_Panels_AB_Table_4_Appendix_5 <- function() {
  
  data.in <- loadData()
  
  
  ## Panel A (R2 Panel)
  # Remove large outliers
  data.analysis <- data.in[saleamount < 1000000 & land_sqft < 300000 & living_sqft < 10000 ]
  
  # 1. R2 Analysis (Table 3 Panel A)
  iBuyer.buys  <- data.analysis[iBuyer.buyer == 1 & iBuyer.seller == 0 & year >= 2013]
  iBuyer.sells <- data.analysis[iBuyer.buyer == 0 & iBuyer.seller == 1 & year >= 2013]
  other        <- data.analysis[iBuyer.buyer == 0 & iBuyer.seller == 0 & year >= 2013]
  
  # Used to partition into high- and low-iBuyer market share markets.
  byZip <- data.analysis[,j=list(m = mean(iBuyer,na.rm=T)),by='zip5']
  other.h <- data.analysis[iBuyer.buyer == 0 & iBuyer.seller == 0 & zip5 %in% byZip[m > quantile(byZip$m,.75,na.rm=T)]$zip5]
  other.l <- data.analysis[iBuyer.buyer == 0 & iBuyer.seller == 0 & zip5 %in% byZip[m < quantile(byZip$m,.75,na.rm=T)]$zip5]
  
  # Baseline hedonic model
  mm       =                     ' ~ age.bin  + size.bin + multistory  + airconditioning + garage + heating + quality '
  
  # Regress log price on characteristics and thing about r2.
  # Progressively more saturated models by sample
  m1.b <- felm(formula(paste('saleamount',mm)),data=iBuyer.buys )
  m1.s <- felm(formula(paste('saleamount',mm)),data=iBuyer.sells )
  m1.o <- felm(formula(paste('saleamount',mm)),data=other )
  m1.h <- felm(formula(paste('saleamount',mm)),data=other.h )
  m1.l <- felm(formula(paste('saleamount',mm)),data=other.l )
  m2.b <- felm(formula(paste('saleamount',mm,'|zip5')),data=iBuyer.buys )
  m2.s <- felm(formula(paste('saleamount',mm,'|zip5')),data=iBuyer.sells )
  m2.o <- felm(formula(paste('saleamount',mm,'|zip5')),data=other )
  m2.h <- felm(formula(paste('saleamount',mm,'|zip5')),data=other.h )
  m2.l <- felm(formula(paste('saleamount',mm,'|zip5')),data=other.l )
  m3.b <- felm(formula(paste('saleamount',mm,'|qtr')),data=iBuyer.buys )
  m3.s <- felm(formula(paste('saleamount',mm,'|qtr')),data=iBuyer.sells )
  m3.o <- felm(formula(paste('saleamount',mm,'|qtr')),data=other )
  m3.h <- felm(formula(paste('saleamount',mm,'|qtr')),data=other.h )
  m3.l <- felm(formula(paste('saleamount',mm,'|qtr')),data=other.l )
  m4.b <- felm(formula(paste('saleamount',mm,'|zip5+qtr')),data=iBuyer.buys )
  m4.s <- felm(formula(paste('saleamount',mm,'|zip5+qtr')),data=iBuyer.sells )
  m4.o <- felm(formula(paste('saleamount',mm,'|zip5+qtr')),data=other )
  m4.h <- felm(formula(paste('saleamount',mm,'|zip5+qtr')),data=other.h )
  m4.l <- felm(formula(paste('saleamount',mm,'|zip5+qtr')),data=other.l )
  m5.b <- felm(formula(paste('saleamount',mm,'|paste(zip5,qtr)')),data=iBuyer.buys )
  m5.s <- felm(formula(paste('saleamount',mm,'|paste(zip5,qtr)')),data=iBuyer.sells ) 
  m5.o <- felm(formula(paste('saleamount',mm,'|paste(zip5,qtr)')),data=other )
  m5.h <- felm(formula(paste('saleamount',mm,'|paste(zip5,qtr)')),data=other.h )
  m5.l <- felm(formula(paste('saleamount',mm,'|paste(zip5,qtr)')),data=other.l )
  
  t = data.table(Characteristics = c('Y','Y','Y','Y','Y'),FE = c('None','Zip','Qtr','Zip + Qtr','Zip x Qtr'),
                 iBuyer.buyer  = 1-c(sum(m1.b$residuals^2)/m1.b$TSS,sum(m2.b$residuals^2)/m2.b$TSS,sum(m3.b$residuals^2)/m3.b$TSS,sum(m4.b$residuals^2)/m4.b$TSS,sum(m5.b$residuals^2)/m5.b$TSS),
                 iBuyer.seller = 1-c(sum(m1.s$residuals^2)/m1.s$TSS,sum(m2.s$residuals^2)/m2.s$TSS,sum(m3.s$residuals^2)/m3.s$TSS,sum(m4.s$residuals^2)/m4.s$TSS,sum(m5.s$residuals^2)/m5.s$TSS),
                 other         = 1-c(sum(m1.o$residuals^2)/m1.o$TSS,sum(m2.o$residuals^2)/m2.o$TSS,sum(m3.o$residuals^2)/m3.o$TSS,sum(m4.o$residuals^2)/m4.o$TSS,sum(m5.o$residuals^2)/m5.o$TSS),
                 high          = 1-c(sum(m1.h$residuals^2)/m1.h$TSS,sum(m2.h$residuals^2)/m2.h$TSS,sum(m3.h$residuals^2)/m3.h$TSS,sum(m4.h$residuals^2)/m4.h$TSS,sum(m5.h$residuals^2)/m5.h$TSS),
                 low           = 1-c(sum(m1.l$residuals^2)/m1.l$TSS,sum(m2.l$residuals^2)/m2.l$TSS,sum(m3.l$residuals^2)/m3.l$TSS,sum(m4.l$residuals^2)/m4.l$TSS,sum(m5.l$residuals^2)/m5.l$TSS))
  names(t) <- c('Hedonic controls','Fixed effects','All markets: iBuyer buyer','All markets: iBuyer seller','All markets: No iBuyer involved','High iBuyer Market: No iBuyer involved','Other Markets: No iBuyer Involved')
  stargazer(t,summary=F,type='html',out='../out/tables/3A.html')
  
  
  
  ## Panel B (Pricing Error and Time-to-Sale)
  
  
  ## Filters (some of these Fixed-effects lack observations --> problematic in model estimation)
  data.analysis <- data.in[year >= 2006 & saleamount < 1000000 & land_sqft < 300000 & living_sqft < 10000 & !(heating %in% c('SP0','ST0')) & quality != 'QFA' & garage != '920'] 
  data.analysis[,zip3 := paste0('z',substr(zip5,1,3))]
  
  # Calculate last sale price 
  data.analysis <- data.analysis[order(pclidirisfrmtd,date)]
  data.analysis[,last.sale := shift(saleamount,1,type='lag'),by='pclidirisfrmtd']
  
  # Partition into training and testing. Training data is 2008-2012, make sure to exclude iBuyer sales (although there shouldn't be any)
  data.train <- data.analysis[year < 2013 & iBuyer.buyer == 0, c('age.bin','size.bin','multistory','zip5','qtr','saleamount'),with=F]
  data.train <- data.train[complete.cases(data.train)]
  
  # Test data
  data.test  <- data.analysis[year >= 2014 ]
  
  # Recover pricing residuals
  pricing.model <- felm(log(saleamount) ~ age.bin  + size.bin + multistory  | paste0(qtr,zip5),data = data.train)
  
  # Get absolute errors. These are what we forecast. Exclude extremely large ones (likely data errors)
  data.train[,e_raw := abs(pricing.model$residuals)]
  data.reg <- data.train[e_raw < 3] 
  
  # Model to predict error terms
  mm.error =                     ' ~ age.bin  + size.bin + multistory   '
  
  deviation.model <- lm(paste('e_raw',mm.error) ,data = data.reg)
  
  
  
  # Get predicted errors in test data.
  data.test[,predicted.e2 := predict(deviation.model,data.test)]
  
  # Build liquidity prediction model. These coefficients come from the MLS regressions (done separately on the NBER data server hence the manual codings.)
  data.test[,mr1 := log(last.sale)]
  data.test[,mr2 := log(last.sale)^2]
  data.test[,mr3 := log(last.sale)^3]
  data.test[,ls1 := log(land_sqft)]
  data.test[,ls2 := log(land_sqft)^2]
  data.test[,ls3 := log(land_sqft)^3]
  data.test[,is1 := log(living_sqft)]
  data.test[,is2 := log(living_sqft)^2]
  data.test[,is3 := log(living_sqft)^3]
  data.test[,ha1 := log(house.age)]
  data.test[,ha2 := log(house.age)^2]
  data.test[,ha3 := log(house.age)^3]
  
  # Create the model. Comes from MLS data regression run on NBER server
   data.test[,prediction := (-7.458e1 +
                              - 4.288e-1 * mr1 + 6.143e-2 * mr2 + -2.344e-3 * mr3 + 
                              + 1.184e-1 * ls1 + 8.913e-3 * ls2 + -1.688e-3 * ls3 + 
                              2.921e1  * is1 - 3.779e0  * is2 +  1.621e-1 * is3 + 
                              1.182e-2 * ha1 + 2.430e-2 * ha2 + -5.951e-3 * ha3 +
                              -1.147e-1 * multistory)]
 
  toReg <- data.test
  
  # Regress iBuyer market share on predictions
  qq3 <- felm(I(100*iBuyer.buyer) ~ predicted.e2|paste(zip5,qtr) |0|pclidirisfrmtd,data=toReg[year == 2018 ])
  qq6 <- felm(I(100*iBuyer.buyer) ~ prediction|paste(zip5,qtr)|0|pclidirisfrmtd,data=toReg[year == 2018  ])
  qq9 <- felm(I(100*iBuyer.buyer) ~ predicted.e2 + prediction|paste(zip5,qtr)|0|pclidirisfrmtd,data=toReg[year == 2018  ])
  qq2 <- felm(I(100*iBuyer.buyer) ~ predicted.e2|paste(zip5,qtr)|0|pclidirisfrmtd,data=toReg)
  qq5 <- felm(I(100*iBuyer.buyer) ~ prediction|paste(zip5,qtr)|0|pclidirisfrmtd,data=toReg)
  qq8 <- felm(I(100*iBuyer.buyer) ~ predicted.e2 + prediction|paste(zip5,qtr)|0|pclidirisfrmtd,data=toReg)
  
  stargazer(qq2,qq5,qq8,qq3,qq6,qq9,type='html',out = '../out/tables/3B.html',
            covariate.labels = c('E-hat','SellsWithin90Days-hat'),
            dep.var.labels = 'iBuyer buyer (%)',
            omit.stat = c('adj.rsq','f','ser'),
            add.lines = list(c('Sample','2014-2018','2014-2018','2014-2018','2018','2018','2018'),
                             c('Zip x Quarter FE','Y','Y','Y','Y','Y','Y')))
  
  toReg[,sales.bin := cut(prediction,breaks=seq(0,.9,.1))]
  jj <- toReg[prediction > .1 & !is.na(prediction),j=list(m1 = mean(iBuyer.buyer,na.rm=T),s=sqrt(1/.N * mean(iBuyer.buyer,na.rm=T) * (1-mean(iBuyer.buyer,na.rm=T)))),by=c('sales.bin')]
  ggplot(jj[!is.na(sales.bin)]) + geom_point(aes(x=sales.bin,y=m1)) + geom_errorbar(aes(x=sales.bin,ymin = m1 - 2*s,ymax = m1 + 2*s ),width=.5)  +
    theme_bw() + scale_y_continuous(labels = percent) + xlab('P(sells within 3 months of listing)') + ylab('iBuyer market share (%)')
  ggsave('../out/figures/1F.png',height=3,width=5,units = 'in') 
  
  # Make Chart
  toReg[,error.bin := ave(predicted.e2,cut2(predicted.e2,g = 4))]
  jj <- toReg[ ,j=list(m1 = mean(iBuyer.buyer,na.rm=T),s=sqrt(1/.N * mean(iBuyer.buyer,na.rm=T) * (1-mean(iBuyer.buyer,na.rm=T)))),by=c('error.bin')]
  jj[,error.bin := (error.bin - min(jj$error.bin,na.rm=T)) / min(jj$error.bin,na.rm=T)]
  ggplot(jj[!is.na(error.bin)]) + geom_point(aes(x=100*error.bin,y=m1)) + geom_errorbar(aes(x=100*error.bin,ymin = m1 - 2*s,ymax = m1 + 2*s ))  +
    theme_bw() + scale_y_continuous(labels = percent) + xlab('Predicted pricing error vs. baseline (%)') + ylab('iBuyer market share (%)')
  ggsave('../out/figures/1E.png',height=3,width=5,units = 'in') 
  
  ## Table 4: Do errors/liquidity matter for returns and time-on-market?
  data.predictions <- toReg[,c('pclidirisfrmtd','zip5','fipscode','iBuyer.buyer','iBuyer.seller','saleamount','qtr','prediction','predicted.e2'),with=F]
  data.predictions <- data.predictions[order(fipscode,zip5,pclidirisfrmtd,qtr)]
  data.predictions[,next.sale := shift(saleamount,type='lead'),by='pclidirisfrmtd']
  data.predictions[,next.sale.qtr := shift(qtr,type='lead'),by='pclidirisfrmtd']
  data.predictions[,holding.period := as.numeric(as.Date(next.sale.qtr) - as.Date(qtr))]
  data.predictions[,next.sale.ibuyer.seller := shift(iBuyer.seller,type='lead'),by='pclidirisfrmtd']
  
  iBuyer.only <- data.predictions[(iBuyer.buyer == 1 | next.sale.ibuyer.seller == 1) & abs(next.sale / saleamount - 1) < 2 & holding.period >= 0 & holding.period < 2 * 365]
  
  qq1 <- felm(I(100 * (next.sale / saleamount) / holding.period) ~ iBuyer.buyer*(predicted.e2 ) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  qq2 <- felm(I(100 * (next.sale / saleamount) / holding.period) ~ iBuyer.buyer*( prediction) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  qq3 <- felm(I(100 * (next.sale / saleamount) / holding.period) ~ iBuyer.buyer*(predicted.e2 + prediction) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  
  qq4 <- felm(I( holding.period / 365) ~ iBuyer.buyer*(predicted.e2 ) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  qq5 <- felm(I( holding.period / 365) ~ iBuyer.buyer*( prediction) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  qq6 <- felm(I( holding.period / 365) ~ iBuyer.buyer*(predicted.e2 + prediction) | paste(qtr,zip5)|0|pclidirisfrmtd,data=data.predictions[abs(next.sale / saleamount - 1) < 2 & holding.period > 0 & holding.period < 2*365])
  
  stargazer(qq1,qq2,qq3,qq4,qq5,qq6,type='html',no.space = T,out='../out/tables/4.html',
            covariate.labels = c('iBuyer','e-hat2','SellsWinthin90Days-hat','iBuyer x e-hat2','iBuyer x SellsWithin90Days-hat'),
            dep.var.labels = c('Annualized Gross Return','Holding period (years)'),
            omit.stat =  c('adj.rsq','f','ser'),
            add.lines = list(c('Zip x Quarter FE','Y','Y','Y','Y','Y','Y')))
  
  
  # Finally, robustness around the modeling years (2008-2012 instead of 2006-2012) (Appendix Table)
  data.train <- data.analysis[year < 2013 & year >= 2008 & iBuyer.buyer == 0, c('age.bin','size.bin','multistory','zip5','qtr','saleamount'),with=F]
  data.train <- data.train[complete.cases(data.train)]
  
  # Recover pricing residuals
  pricing.model.robustness <- felm(log(saleamount) ~ age.bin  + size.bin + multistory  | paste0(qtr,zip5),data = data.train)
  
  # Get absolute errors. These are what we forecast. Exclude extremely large ones (likely data errors)
  data.train[,e_raw := abs(pricing.model.robustness$residuals)]
  data.reg <- data.train[e_raw < 3] 
  
  # Model to predict error terms
  mm.error =                     ' ~ age.bin  + size.bin + multistory   '
  deviation.model.robustness <- lm(paste('e_raw',mm.error) ,data = data.reg)

  # Stargazer has trouble outputting these simultaneously for some reason so output them separately and combine manually.
  stargazer(pricing.model,deviation.model,type='html',out='../out/tables/A5a.html',add.lines = list(c('Other House Characteristics','Y','Y'),
                                                                                                    c('Sample','2006-2012','2006-2012')),
            omit.stat = c('adj.rsq','f','ser'),
            dep.var.labels = c('Log(house price)','Squared deviation from predicted price'))
  stargazer(pricing.model.robustness,deviation.model.robustness,type='html',out='../out/tables/A5b.html',add.lines = list(c('Other House Characteristics','Y','Y'),
                                                                                                                          c('Sample','2008-2012','2008-2012')),
            omit.stat = c('adj.rsq','f','ser'),
            dep.var.labels = c('Log(house price)','Squared deviation from predicted price'))
    
}


Table_3_Panels_AB_Table_4_Appendix_5()
